// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License. 

#include "dspm_mult_platform.h"
#if (dspm_mult_f32_arp4_enabled == 1)

// This is a matrix multiplication function for the ESP32 processor.
    // Emit the routine into the code section.
    .text
    .align  4
    // Export both the function entry point and the label that follows its
    // stack adjustment.
    .global dspm_mult_f32_arp4
    .global .dspm_mult_f32_arp4_body
    // Mark the entry symbol as a function in the object file.
    .type   dspm_mult_f32_arp4,@function
// The function implements the following C code:
// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
    // for (int i=0 ; i< m ; i++)
    // {
    //     for (int j=0 ; j< k ; j++)
    //     {
    //         C[i*k + j] = A[i*n]*B[j];
    //         for (int s=1; s< n ; s++)
    //         {
    //             C[i*k + j] += A[i*n + s]*B[s*k + j];
    //         }
    //     }
    // }
//     return ESP_OK;
// }

dspm_mult_f32_arp4:
// esp_err_t dspm_mult_f32_arp4(const float *A, const float *B, float *C, int m, int n, int k)
//
// Computes C[m x k] = A[m x n] * B[n x k] for row-major float matrices,
// one dot product (row of A times column of B) per output element.
//
// Arguments:
//   a0 - A; advanced by one row (n*4 bytes) after every outer-loop pass
//   a1 - B
//   a2 - C; advanced by 4 bytes after every stored element
//   a3 - m
//   a4 - n; also used as the hardware-loop iteration count
//   a5 - k
// Returns:
//   a0 - 0 (ESP_OK)
// Scratch registers written:
//   a6  - n*4, byte stride between rows of A
//   a7  - outer loop counter i: 0..m-1
//   t0  - 4, byte step between consecutive floats
//   t1  - inner loop counter j: 0..k-1
//   t2  - running pointer into the current row of A
//   t3  - running pointer into the current column of B
//   t5  - k*4, byte stride between rows of B
//   fa0 - element loaded from A
//   fa1 - element loaded from B
//   fa2 - dot-product accumulator
//
// NOTE(review): n is handed to esp.lp.setup unchecked; callers are assumed
// to pass n > 0 - confirm the hardware-loop behaviour for a zero count.

    addi    sp, sp, -16         // reserve 16 bytes; nothing is stored in them here

.dspm_mult_f32_arp4_body:
    // This label is exported with .global, so code outside this file can
    // presumably enter here with its own 16 bytes already reserved; the exit
    // path below always releases exactly 16 bytes.

    // Empty result matrix: the reference C loops run zero times, so skip
    // the computation instead of storing through C.
    blez    a3, .dpf_exit       // m <= 0
    blez    a5, .dpf_exit       // k <= 0

    slli    a6, a4, 2           // a6 = n*4, row stride of A in bytes
    slli    t5, a5, 2           // t5 = k*4, row stride of B in bytes
    li      t0, 4               // byte step between consecutive floats
    li      a7, 0               // i = 0

.dpf_loop1:
    li      t1, 0               // j = 0

.dpf_loop2:
        mv      t2, a0          // t2 = &A[i*n], start of the current row

        slli    t3, t1, 2       // column index j -> byte offset
        add     t3, a1, t3      // t3 = &B[j], top of the current column

        fmv.w.x fa2, zero       // accumulator = 0.0f

        // Hardware loop 0 with iteration count a4 (n). The loop-end label
        // sits on the last instruction of the body.
        esp.lp.setup    0, a4, .matrix_mul_loop
            flw     fa0, 0(t2)              // fa0 = A[i*n + s]
            add     t2, t2, t0              // next element in the row of A
            flw     fa1, 0(t3)              // fa1 = B[s*k + j]
            fmadd.s   fa2, fa1, fa0, fa2    // fa2 += fa1 * fa0
        .matrix_mul_loop: add       t3, t3, t5  // next row of B, same column

        fsw     fa2, 0(a2)      // C[i*k + j] = accumulator
        addi    a2, a2, 4       // C elements are written strictly in order

        addi    t1, t1, 1       // j++
        blt     t1, a5, .dpf_loop2

    add     a0, a0, a6          // advance A to the next row

    addi    a7, a7, 1           // i++
    blt     a7, a3, .dpf_loop1

.dpf_exit:
    li      a0, 0               // return ESP_OK (previously returned a6 = n*4)
    addi    sp, sp, 16          // release the 16 bytes reserved on entry
    ret

#endif //dspm_mult_f32_arp4_enabled